#!/usr/bin/env python3


import csv
import math
import pickle
import random
import re
import string
import urllib
import urllib.parse

import numpy as np
import pandas as pd
from nltk.corpus import stopwords


###############################################################
###### GLOBALS
###############################################################

##### Modify here as needed
dataPath = '/gscratch/comdata/users/kaylea/taboo/processed_data/'
DEBUG = 0  # any truthy value switches to the test input/output files below
#########

########################## these should not need to be modified
# Extra stop words used on top of NLTK's English list. Copied from
# /gscratch/comdata/users/kaylea/taboo/Step3Classify/helpers.py
# TODO: move these into a shared local library instead of copying them.
my_stop = [
    'term', 'used', 'usually', 'particularly', 'etc', 'extremely',
    'especially', 'one', 'en', 'something', 'often', 'synonym', 'like',
    'etc.', 'person',
]
stop = my_stop + stopwords.words('english')

# Coefficients from the model fit.
# (Alternative input file: 'powerful_unigram_euph_coefs.tsv'.)
coefsFile = dataPath + 'powerful_euph_coefs.tsv'
pattern = re.compile("^redirect (.*)", re.IGNORECASE)

# DEBUG selects the test title excerpt and a separate output file so a
# trial run does not overwrite the real sample.
if DEBUG:
    print("In debug mode.")
titlesFile = dataPath + (
    'test_enwiki_current_excerpt.tsv' if DEBUG else 'enwiki_current_excerpt.tsv'
)
euphOut = dataPath + ('euphSample_test.tsv' if DEBUG else 'euphSampleFixed.tsv')

# tabList collects the titles kept for the output file; ranList is not
# referenced anywhere in the visible part of the script.
ranList = []
tabList = []

## read in the high-value coefs and the article title table

# Load both tab-separated inputs; the first row of each file is the header.
coefDF, titleDF = (
    pd.read_csv(path, sep='\t', header=0) for path in (coefsFile, titlesFile)
)

# Print the first rows of each table as a quick sanity check.
print(coefDF.head())
print(titleDF.head())

# The 'names' column holds the coefficient terms to look up among the titles.
coefs = coefDF['names'].to_list()


# Decode the percent-encoded titles and drop any surrounding double quotes.
titleDF['title'] = (
    titleDF['encodedTitle']
    .apply(urllib.parse.unquote)
    .apply(lambda t: t.strip('"'))
)

# Build the matching key: lower-case FIRST, then drop stop words.
# The stop-word lists are all lower-case, so filtering before lower-casing
# (as this code used to do) let capitalised stop words such as "The" slip
# through into filtered_title.
stopSet = set(stop)  # set membership instead of scanning the list per word
titleDF['filtered_title'] = (
    titleDF['title']
    .str.lower()
    .apply(lambda t: ' '.join(w for w in t.split() if w not in stopSet))
)

# Round 1: find any direct title matches to high-value coefs.
#
# Narrow the title table once to rows whose filtered title equals *some*
# coef, so the per-coef comparison below scans only those candidates
# instead of the full title table on every iteration.
candidateDF = titleDF[titleDF['filtered_title'].isin(coefs)]

matchFrames = []
for c in coefs:
    matchDF = candidateDF[candidateDF['filtered_title'] == c]  # exact matches only
    if matchDF.empty:
        continue
    print(f"Found matches for {c}:")
    print(matchDF)  # show the matching rows (previously printed the literal text "matchDF")
    matchFrames.append(matchDF)

# Concatenate once at the end; pd.concat() raises on an empty list, so fall
# back to an empty frame when nothing matched.
euphDF = pd.concat(matchFrames) if matchFrames else pd.DataFrame()

print(f"Done checking {len(coefs)} coefs.")
if DEBUG:
    print(f"Preliminary set is: \n{len(euphDF)} long -- \nNow let's check for redirects in there.")


# Round 2: map every matched title to its redirect target (if any) and keep
# it only when that final title is itself present in the title table.
#
# Build the lookup once: a set gives constant-time membership tests instead
# of rescanning every title for each matched row.
knownTitles = set(titleDF['encodedTitle'])

for _, row in euphDF.iterrows():
    try:
        if row['encodedTitle'] != row['target']:
            print(f"Taboo item {row['encodedTitle']} is a redirect! Remap! Remap!")
            goodTitle = row['target']
        else:  # it was not a redirect target
            print(f"Taboo item {row} is not a redirect, add it to the set and move on.")
            goodTitle = row['encodedTitle']  # already good
        print(f"Now processing {goodTitle}")
        # Exact, case-sensitive match (a case-insensitive variant was tried
        # earlier and deliberately left disabled).
        if goodTitle in knownTitles:
            print(f"Now adding {goodTitle} to final dataset.")
            tabList.append(goodTitle)
        else:
            print(f"I don't see {goodTitle} in the cleaned up titles list, can't keep it. Probably a disambig.")
    except Exception as err:
        # Best effort: report the bad row together with the error and keep
        # going. Catching Exception (not a bare except) lets Ctrl-C and
        # SystemExit still stop the run.
        print(f"some kind of failure in {row}: {err!r}")
        continue


print(f"the list of articles is {tabList}")

# De-duplicate while keeping first-seen order. list(set(...)) would give a
# row order that changes from run to run (string hashing is randomised per
# process), making the output file non-reproducible.
tabList = list(dict.fromkeys(tabList))

# Write the final titles as a single-column, tab-separated file.
euphDF_fixed = pd.DataFrame(tabList, columns=['encodedTitle'])
euphDF_fixed.to_csv(euphOut, mode='w', sep='\t', index=False)
